1. Data

Load the student scores for the test - here we load the 2018 ETH Zurich test data:

head(test_scores)
## # A tibble: 6 x 38
##   year  class     A1    A2    A3    A4    A5    A6    A7    A8    A9   A10   A11
##   <chr> <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2018  401-0~     1     0     1     1     1     1     1     0     0     1     1
## 2 2018  401-0~     1     0     1     1     0     0     1     1     1     0     0
## 3 2018  401-0~     1     1     1     1     1     1     1     0     1     1     1
## 4 2018  401-0~     1     0     0     0     0     0     0     0     0     0     0
## 5 2018  401-0~     0     0     1     0     0     0     0     0     1     1     0
## 6 2018  401-0~     1     1     1     1     0     1     1     1     0     0     0
## # ... with 25 more variables: A12 <dbl>, A13 <dbl>, A14 <dbl>, A15 <dbl>,
## #   A16 <dbl>, A17 <dbl>, A18 <dbl>, A19 <dbl>, A20 <dbl>, A21 <dbl>,
## #   A22 <dbl>, A23 <dbl>, A24 <dbl>, A25 <dbl>, A26 <dbl>, A27 <dbl>,
## #   A28 <dbl>, A29 <dbl>, A30 <dbl>, A31 <dbl>, A32 <dbl>, A33 <dbl>,
## #   A34 <dbl>, A35 <dbl>, A36 <dbl>

Data summary

The number of responses from each class:

# Number of responses per (year, class), rendered as a colour-scaled table.
# count() is the idiomatic shorthand for group_by() + tally() and avoids the
# deprecated internal code path that triggered the `.dots` warning.
test_scores %>% 
  count(year, class) %>% 
  # gt() turns grouping into row groups; the original group_by() + tally()
  # left the result grouped by year, so re-group to keep the year header row.
  group_by(year) %>% 
  gt() %>% 
  data_color(
    columns = c("n"),
    colors = scales::col_numeric(palette = c("Blues"), domain = NULL)
  )
## Warning: The `.dots` argument of `group_by()` is deprecated as of dplyr 1.0.0.
class n
2018
401-0131-00L 230
401-0141-00L 124
401-0151-00L 201
401-0231-10L 170
401-0251-00L 163
401-0261-G0L 283
401-0261-GUL 31
401-0261-GXL 314
401-0271-00L 108
401-0281-00L 59
401-0291-00L 235
401-1151-00L 275

Mean and standard deviation for each item:

# Per-item mean and standard deviation, grouped by year, as a formatted table.
test_scores %>% 
  select(-class) %>% 
  group_by(year) %>% 
  # skimr summary without the inline histogram sparklines
  skim_without_charts() %>% 
  # drop the character-variable columns, the numeric percentiles, and
  # skim_type, keeping n_missing / complete_rate / mean / sd
  select(-contains("character."), -contains("numeric.p"), -skim_type) %>% 
  # re-group so gt() renders one row group per year
  group_by(year) %>% 
  gt() %>% 
  fmt_number(columns = contains("numeric"), decimals = 3) %>%
  # colour-scale the item means so easy/hard items stand out
  data_color(
    columns = c("numeric.mean"),
    colors = scales::col_numeric(palette = c("Greens"), domain = NULL)
  ) %>%
  cols_label(
    numeric.mean = "Mean",
    numeric.sd = "SD"
  )
skim_variable n_missing complete_rate Mean SD
2018
A1 0 1 0.964 0.186
A2 0 1 0.293 0.455
A3 0 1 0.654 0.476
A4 0 1 0.644 0.479
A5 0 1 0.502 0.500
A6 0 1 0.745 0.436
A7 0 1 0.731 0.444
A8 0 1 0.519 0.500
A9 0 1 0.488 0.500
A10 0 1 0.556 0.497
A11 0 1 0.610 0.488
A12 0 1 0.728 0.445
A13 0 1 0.377 0.485
A14 0 1 0.586 0.493
A15 0 1 0.485 0.500
A16 0 1 0.205 0.404
A17 0 1 0.632 0.482
A18 0 1 0.368 0.482
A19 0 1 0.343 0.475
A20 0 1 0.742 0.437
A21 0 1 0.536 0.499
A22 0 1 0.541 0.498
A23 0 1 0.463 0.499
A24 0 1 0.610 0.488
A25 0 1 0.748 0.434
A26 0 1 0.834 0.372
A27 0 1 0.422 0.494
A28 0 1 0.481 0.500
A29 0 1 0.534 0.499
A30 0 1 0.749 0.434
A31 0 1 0.394 0.489
A32 0 1 0.212 0.409
A33 0 1 0.782 0.413
A34 0 1 0.624 0.485
A35 0 1 0.339 0.473
A36 0 1 0.234 0.423

2. Testing assumptions

Before applying IRT, we should check that the data satisfies the assumptions needed by the model. In particular, to use a 1-dimensional IRT model, we should have some evidence of unidimensionality in the test scores.

Local independence

This plot shows the correlations between scores on each pair of items:

# Keep only the item columns, dropping the year/class identifiers.
item_scores <- test_scores %>%
  select(-c(class, year))

# Correlations with confidence intervals for every item pair; plotting is
# deferred so we can render the upper/lower-CI plot explicitly below.
cor_ci <- psych::corCi(item_scores, plot = FALSE)

# Correlation plot showing the upper and lower confidence bounds.
psych::cor.plot.upperLowerCi(cor_ci)

There are a few correlations that are not significantly different from 0:

# Item pairs whose correlation is not significantly different from zero
# (p > 0.05), largest p first, as a formatted table.
cor_ci$ci %>%
  as_tibble(rownames = "corr") %>%
  # drop the empirical (".e") interval columns before filtering
  select(-contains(".e")) %>%
  filter(p > 0.05) %>%
  arrange(desc(p)) %>%
  gt() %>%
  fmt_number(columns = 2:4, decimals = 3)
corr lower upper p
A2-A34 −0.034 0.058 0.615
A1-A2 −0.021 0.056 0.366
A1-A36 −0.013 0.069 0.186
A1-A18 −0.003 0.079 0.070

The overall picture is that the item scores are well correlated with each other.

Dimensionality

# Check whether the item-score matrix is suitable for factor analysis
# (KMO sampling adequacy and Bartlett's test of sphericity).
structure <- check_factorstructure(item_scores)
# Estimate the number of latent factors via the method agreement procedure
# (a battery of extraction criteria voting on the factor count).
n <- n_factors(item_scores)

Is the data suitable for Factor Analysis?

  • KMO: The Kaiser, Meyer, Olkin (KMO) measure of sampling adequacy suggests that data seems appropriate for factor analysis (KMO = 0.94).
  • Sphericity: Bartlett’s test of sphericity suggests that there is sufficient significant correlation in the data for factor analysis (Chisq(630) = 18176.34, p < .001).

Method Agreement Procedure:

The choice of 1 dimensions is supported by 4 (17.39%) methods out of 23 (Acceleration factor, VSS complexity 1, Velicer’s MAP, RMSEA).

# Visualise how many methods support each candidate number of factors.
plot(n)

# The same method-agreement results as a table.
summary(n) %>% gt()
n_Factors n_Methods
1 4
2 1
3 1
4 1
5 2
6 4
9 1
11 2
26 1
28 1
33 1
34 1
35 3
#n %>% tibble() %>% gt()
# Parallel analysis; fa = "fa" extracts factors only (no principal components).
fa.parallel(item_scores, fa = "fa")

## Parallel analysis suggests that the number of factors =  9  and the number of components =  NA

1 Factor

# Maximum-likelihood factor analysis with a single factor. With one factor
# the rotation has no effect; "varimax" is factanal()'s default.
fitfact <- factanal(item_scores, factors = 1, rotation = "varimax")
# Print loadings sorted by size, hiding loadings below 0.3 for readability.
print(fitfact, digits = 2, cutoff = 0.3, sort = TRUE)
## 
## Call:
## factanal(x = item_scores, factors = 1, rotation = "varimax")
## 
## Uniquenesses:
##   A1   A2   A3   A4   A5   A6   A7   A8   A9  A10  A11  A12  A13  A14  A15  A16 
## 0.95 0.96 0.76 0.82 0.74 0.76 0.82 0.80 0.63 0.69 0.68 0.90 0.92 0.76 0.88 0.81 
##  A17  A18  A19  A20  A21  A22  A23  A24  A25  A26  A27  A28  A29  A30  A31  A32 
## 0.76 0.81 0.80 0.64 0.61 0.66 0.66 0.75 0.93 0.70 0.64 0.68 0.88 0.83 0.80 0.82 
##  A33  A34  A35  A36 
## 0.87 0.88 0.81 0.86 
## 
## Loadings:
##  [1] 0.51 0.61 0.56 0.56 0.60 0.62 0.58 0.58 0.55 0.60 0.57           0.49 0.43
## [16] 0.49 0.42 0.44 0.31      0.49 0.35 0.43 0.49 0.44 0.45 0.50      0.35 0.41
## [31] 0.45 0.43 0.36 0.35 0.44 0.38
## 
##                Factor1
## SS loadings       7.75
## Proportion Var    0.22
## 
## Test of the hypothesis that 1 factor is sufficient.
## The chi square statistic is 3903.11 on 594 degrees of freedom.
## The p-value is 0
# Tidy the factor-analysis fit into a data frame of loadings (column fl1).
load <- tidy(fitfact)

# Plot each item's standardised loading on the single factor.
# max.overlaps = Inf makes ggrepel label every point -- at the default it
# dropped 14 labels ("Consider increasing max.overlaps" warning).
ggplot(load, aes(x = fl1, y = 0)) + 
  geom_point() + 
  geom_label_repel(aes(label = paste0("A", rownames(load))),
                   show.legend = FALSE, max.overlaps = Inf) +
  labs(x = "Factor 1", y = NULL,
       title = "Standardised Loadings", 
       subtitle = "Based upon correlation matrix") +
  theme_minimal()
## Warning: ggrepel: 14 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# To stop knitting before the IRT analysis, uncomment the knit_exit() line below; leave it commented to proceed.
#knitr::knit_exit()

3. Fitting 2 parameter logistic MIRT model

We can fit a Multidimensional Item Response Theory (mirt) model. From the function definition:

mirt fits a maximum likelihood (or maximum a posteriori) factor analysis model to any mixture of dichotomous and polytomous data under the item response theory paradigm using either Cai's (2010) Metropolis-Hastings Robbins-Monro (MHRM) algorithm.

The process is to first fit the model, and save the result as a model object that we can then parse to get tabular or visual displays of the model that we might want. When fitting the model, we have the option to specify a few arguments, which then get interpreted as parameters to be passed to the model.

# Fit a unidimensional 2PL IRT model. SE = TRUE is required so that the
# confidence intervals reported by coef() later are available.
fit_2pl <- mirt(
  data = item_scores, # just the columns with question scores
  model = 1,          # number of factors to extract
  itemtype = "2PL",   # 2 parameter logistic model
  SE = TRUE           # estimate standard errors
  )
## 
Iteration: 1, Log-Lik: -42537.007, Max-Change: 0.80697
Iteration: 2, Log-Lik: -41857.079, Max-Change: 0.42644
Iteration: 3, Log-Lik: -41765.562, Max-Change: 0.39313
Iteration: 4, Log-Lik: -41733.247, Max-Change: 0.13739
Iteration: 5, Log-Lik: -41714.377, Max-Change: 0.08694
Iteration: 6, Log-Lik: -41703.599, Max-Change: 0.06036
Iteration: 7, Log-Lik: -41697.345, Max-Change: 0.05323
Iteration: 8, Log-Lik: -41693.703, Max-Change: 0.03370
Iteration: 9, Log-Lik: -41691.469, Max-Change: 0.03145
Iteration: 10, Log-Lik: -41689.421, Max-Change: 0.01847
Iteration: 11, Log-Lik: -41688.919, Max-Change: 0.01498
Iteration: 12, Log-Lik: -41688.609, Max-Change: 0.01278
Iteration: 13, Log-Lik: -41688.333, Max-Change: 0.00703
Iteration: 14, Log-Lik: -41688.246, Max-Change: 0.00468
Iteration: 15, Log-Lik: -41688.188, Max-Change: 0.00459
Iteration: 16, Log-Lik: -41688.115, Max-Change: 0.00298
Iteration: 17, Log-Lik: -41688.094, Max-Change: 0.00173
Iteration: 18, Log-Lik: -41688.087, Max-Change: 0.00123
Iteration: 19, Log-Lik: -41688.075, Max-Change: 0.00113
Iteration: 20, Log-Lik: -41688.071, Max-Change: 0.00065
Iteration: 21, Log-Lik: -41688.068, Max-Change: 0.00077
Iteration: 22, Log-Lik: -41688.064, Max-Change: 0.00052
Iteration: 23, Log-Lik: -41688.062, Max-Change: 0.00041
Iteration: 24, Log-Lik: -41688.061, Max-Change: 0.00023
Iteration: 25, Log-Lik: -41688.060, Max-Change: 0.00025
Iteration: 26, Log-Lik: -41688.059, Max-Change: 0.00024
Iteration: 27, Log-Lik: -41688.058, Max-Change: 0.00022
Iteration: 28, Log-Lik: -41688.056, Max-Change: 0.00016
Iteration: 29, Log-Lik: -41688.055, Max-Change: 0.00016
Iteration: 30, Log-Lik: -41688.055, Max-Change: 0.00015
Iteration: 31, Log-Lik: -41688.054, Max-Change: 0.00012
Iteration: 32, Log-Lik: -41688.054, Max-Change: 0.00012
Iteration: 33, Log-Lik: -41688.054, Max-Change: 0.00012
Iteration: 34, Log-Lik: -41688.053, Max-Change: 0.00010
## 
## Calculating information matrix...

We then compute factor score estimates and augment the existing data frame with these estimates, to keep everything in one place. To do the estimation, we use the fscores() function from the mirt package which takes in a computed model object and computes factor score estimates according to the method specified. We will use the EAP method for factor score estimation, which is the “expected a-posteriori” method, the default. We specify it explicitly below, but the results would have been the same if we omitted specifying the method argument since it’s the default method the function uses.

# Augment the data with EAP factor-score estimates. fscores() returns a
# one-column matrix, so take the first column to store F1 as a plain numeric
# vector -- matrix columns in data frames trip up some downstream
# tidyverse/ggplot code.
test_scores <- test_scores %>%
  mutate(F1 = fscores(fit_2pl, method = "EAP")[, 1])

We can also calculate the model coefficient estimates using a generic function coef() which is used to extract model coefficients from objects returned by modeling functions. We will set the IRTpars argument to TRUE, which means slope intercept parameters will be converted into traditional IRT parameters.

# Extract item parameters; IRTpars = TRUE converts the slope/intercept
# parameterisation to traditional IRT a (discrimination) / b (difficulty).
# Result: a list with one element per item plus a GroupPars element.
coefs_2pl <- coef(fit_2pl, IRTpars = TRUE)

The resulting object coefs_2pl is a list, with one element for each question, and an additional GroupPars element that we won’t be using. The output is a bit long, so we’re only showing a few of the elements here:

coefs_2pl[1:3]
## $A1
##                a         b  g  u
## par     1.277290 -3.136814  0  1
## CI_2.5  1.003766 -3.629126 NA NA
## CI_97.5 1.550813 -2.644502 NA NA
## 
## $A2
##                 a        b  g  u
## par     0.4686183 1.969524  0  1
## CI_2.5  0.3634940 1.517563 NA NA
## CI_97.5 0.5737426 2.421485 NA NA
## 
## $A3
##                a          b  g  u
## par     1.320866 -0.6360125  0  1
## CI_2.5  1.172386 -0.7343667 NA NA
## CI_97.5 1.469346 -0.5376583 NA NA
# coefs_2pl[35:37]

Let’s take a closer look at the first element:

coefs_2pl[1]
## $A1
##                a         b  g  u
## par     1.277290 -3.136814  0  1
## CI_2.5  1.003766 -3.629126 NA NA
## CI_97.5 1.550813 -2.644502 NA NA

In this output:

  • a is discrimination
  • b is difficulty
  • endpoints of the 95% confidence intervals are also shown

To make this output a little more user friendly, we should tidy it such that we have a row per question. We’ll do this in two steps. First, write a function that tidies the output for one question, i.e. one list element. Then, map this function over the list of all questions, resulting in a data frame.

# Tidy one element of the coef(fit) list (a named list holding one item's
# coefficient matrix) into a one-row tibble with estimate and 95% CI columns
# for discrimination (a) and difficulty (b).
# NOTE(review): assumes reshape2::melt() names the matrix row/column index
# variables X1/X2 -- newer reshape2 versions use Var1/Var2 for matrices, so
# confirm against the installed version.
tidy_mirt_coefs <- function(x){
  x %>%
    # melt the list element into long form (row label, column label, value)
    melt() %>%
    # convert to a tibble
    as_tibble() %>%
    # convert factors to characters
    mutate(across(where(is.factor), as.character)) %>%
    # only focus on rows where X2 is a or b (discrimination or difficulty)
    filter(X2 %in% c("a", "b")) %>%
    # in X1, relabel par (parameter) as est (estimate)
    mutate(X1 = if_else(X1 == "par", "est", X1)) %>%
    # unite columns X2 and X1 into a new column called var separated by _
    # (giving a_est, a_CI_2.5, ..., b_CI_97.5)
    unite(X2, X1, col = "var", sep = "_") %>%
    # turn into a wider data frame: one row, one column per var
    pivot_wider(names_from = var, values_from = value)
}

Let’s see what this does to a single element of coefs_2pl:

# Sanity check: tidy a single list element (item A1) before mapping over all.
tidy_mirt_coefs(coefs_2pl[1])
## # A tibble: 1 x 7
##   L1    a_est a_CI_2.5 a_CI_97.5 b_est b_CI_2.5 b_CI_97.5
##   <chr> <dbl>    <dbl>     <dbl> <dbl>    <dbl>     <dbl>
## 1 A1     1.28     1.00      1.55 -3.14    -3.63     -2.64

And now let’s map it over the item elements of coefs_2pl (one per question):

# Tidy every item's coefficients into one data frame. Index the list by the
# item names so all 36 questions are included and the trailing GroupPars
# element is excluded -- the original `coefs_2pl[1:32]` silently dropped
# items A33-A36.
tidy_2pl <- map_dfr(coefs_2pl[colnames(item_scores)], tidy_mirt_coefs, .id = "Question")

A quick peek at the result:

tidy_2pl
## # A tibble: 32 x 7
##    Question a_est a_CI_2.5 a_CI_97.5    b_est b_CI_2.5 b_CI_97.5
##    <chr>    <dbl>    <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
##  1 A1       1.28     1.00      1.55  -3.14     -3.63     -2.64  
##  2 A2       0.469    0.363     0.574  1.97      1.52      2.42  
##  3 A3       1.32     1.17      1.47  -0.636    -0.734    -0.538 
##  4 A4       1.09     0.955     1.22  -0.671    -0.785    -0.557 
##  5 A5       1.41     1.26      1.56  -0.00621  -0.0875    0.0751
##  6 A6       1.50     1.33      1.67  -0.989    -1.10     -0.881 
##  7 A7       1.15     1.01      1.29  -1.08     -1.22     -0.949 
##  8 A8       1.11     0.979     1.24  -0.0839   -0.178     0.0106
##  9 A9       1.97     1.77      2.17   0.0409   -0.0279    0.110 
## 10 A10      1.64     1.47      1.81  -0.200    -0.276    -0.124 
## # ... with 22 more rows

And a nicely formatted table of the result:

# Full table of 2PL parameters: green scale over discrimination (a_*),
# blue scale over difficulty (b_*), with spanner headings over each group.
gt(tidy_2pl) %>%
  fmt_number(columns = contains("_"), decimals = 3) %>%
  data_color(
    columns = contains("a_"),
    colors = scales::col_numeric(palette = c("Greens"), domain = NULL)
  ) %>%
  data_color(
    columns = contains("b_"),
    colors = scales::col_numeric(palette = c("Blues"), domain = NULL)
  ) %>%
  tab_spanner(label = "Discrimination", columns = contains("a_")) %>%
  tab_spanner(label = "Difficulty", columns = contains("b_")) %>%
  # relabel the estimate/CI columns under their spanners
  cols_label(
    a_est = "Est.",
    b_est = "Est.",
    a_CI_2.5 = "2.5%",
    b_CI_2.5 = "2.5%",
    a_CI_97.5 = "97.5%",
    b_CI_97.5 = "97.5%"
  )
# Difficulty estimate per question with its 95% confidence interval.
tidy_2pl %>%
  mutate(question_number = parse_number(Question)) %>%
  ggplot(aes(question_number, b_est)) +
  geom_errorbar(aes(ymin = b_CI_2.5, ymax = b_CI_97.5), width = 0.2) +
  geom_point() +
  labs(x = "Question", y = "Difficulty") +
  theme_minimal()

Comparing years and classes

Do students from different programmes of study have different distributions of ability?

Differences between years

Compare the distribution of abilities in the year groups (though in this case there is only one).

# Ability (F1) density by year -- only 2018 is present, so a single curve.
ggplot(test_scores, aes(F1, fill = as.factor(year), colour = as.factor(year))) +
  geom_density(alpha=0.5) + 
  scale_x_continuous(limits = c(-3.5,3.5)) +
  labs(title = "Density plot", 
       subtitle = "Ability grouped by year of taking the test", 
       x = "Ability", y = "Density",
       fill = "Year", colour = "Year") +
  theme_minimal()

Differences between classes

Compare the distribution of abilities in the various classes.

# Ability distributions per class as overlapping ridgeline densities.
ggplot(test_scores, aes(x = F1, y = class, colour = class, fill = class)) +
  geom_density_ridges(alpha = 0.5) + 
  scale_x_continuous(limits = c(-3.5,3.5)) +
  # guides(fill = FALSE) is deprecated since ggplot2 3.3.4; "none" is the
  # supported way to suppress the redundant legends.
  guides(fill = "none", colour = "none") +
  labs(title = "Density plot", 
       subtitle = "Ability grouped by class of taking the test", 
       x = "Ability", y = "Class") +
  theme_minimal()
## Picking joint bandwidth of 0.286

Information curves

Test information curve

# Test information curve together with the standard error of measurement.
plot(fit_2pl, type = "infoSE", main = "Test information")

Item information curves

# Information curves for each item, one facet per item.
plot(fit_2pl, type = "infotrace", main = "Item information curves")

Response curves

Test response curves

# Expected total test score as a function of ability.
plot(fit_2pl, type = "score", auto.key = FALSE)

Item response curves

We can get individual item surface and information plots using the itemplot() function from the mirt package, e.g.

# Trace lines (item characteristic curves) for a single item.
mirt::itemplot(fit_2pl, item = 1, 
               main = "Trace lines for item 1")

We can also get the plots for all trace lines, one facet per plot.

# Trace lines for every item, one facet per item.
plot(fit_2pl, type = "trace", auto.key = FALSE)

Or all of them overlaid in one plot.

# All item trace lines overlaid in a single panel.
plot(fit_2pl, type = "trace", facet_items=FALSE)

An alternative approach is using ggplot2 and plotly to add interactivity to make it easier to identify items.

# Store the lattice object so the plotted trace-line data can be extracted.
plt <- plot(fit_2pl, type = "trace", facet_items = FALSE)
# The data we need is in panel.args.
# NOTE(review): this plot object has two panels for some reason; the traces
# we want are in the 2nd panel -- confirm if mirt's plotting internals change.
panel <- plt$panel.args[[2]]
# Every item is evaluated on the same theta grid, so the grid length is the
# total number of points divided by the number of items. This avoids
# hard-coding mirt's default of 200 grid points; if the assumption is wrong,
# rep() errors loudly instead of silently misaligning item labels.
pts_per_item <- length(panel$x) / ncol(item_scores)
plt_data <- tibble(
  x          = panel$x,
  y          = panel$y,
  subscripts = panel$subscripts,
  item       = rep(colnames(item_scores), each = pts_per_item)
) %>%
  mutate(
    # numeric item index, used to put A1..A36 in natural order
    item_no = str_remove(item, "A") %>% as.numeric(),
    item    = fct_reorder(item, item_no)
    )

head(plt_data)
## # A tibble: 6 x 5
##       x      y subscripts item  item_no
##   <dbl>  <dbl>      <int> <fct>   <dbl>
## 1 -6    0.0252        201 A1          1
## 2 -5.94 0.0271        202 A1          1
## 3 -5.88 0.0292        203 A1          1
## 4 -5.82 0.0315        204 A1          1
## 5 -5.76 0.0339        205 A1          1
## 6 -5.70 0.0365        206 A1          1
# Interactive version of the trace lines: build a ggplot keyed by item, then
# hand it to plotly so hovering identifies which item a curve belongs to.
plt_gg <- ggplot(plt_data, aes(x, y, 
                          colour = item, 
                          text = item)) + 
  geom_line() + 
  labs(
    title = "2PL - Trace lines",
    # plotmath labels are kept commented out -- presumably because ggplotly
    # cannot render expression() labels; verify before re-enabling.
    #x = expression(theta),
    x = "theta",
    #y = expression(P(theta)),
    y = "P(theta)",
    colour = "Item"
  ) +
  theme_minimal() +
  # the legend would duplicate the hover tooltips, so hide it
  theme(legend.position = "none")

# tooltip = "text" shows only the item name on hover
ggplotly(plt_gg, tooltip = "text")
knitr::knit_exit()